# Document-wide knitr chunk defaults: echo the R code, print warnings and
# messages, and keep knitting when a chunk errors (usually easier to debug).
knitr::opts_chunk$set(
  echo = TRUE,
  error = TRUE,
  message = TRUE,
  warning = TRUE
)

# Use the black-and-white ggplot2 theme for every figure.
ggplot2::theme_set(ggplot2::theme_bw())

# Set pander's table-splitting width to Inf so wide tables are not split.
pander::panderOptions("table.split.table", Inf)
# load libraries
library(codebook)
library(here)
library(dplyr)
library(tidyverse)
library(future)
library(labelled)

This is a data dictionary for the data set used in the paper “Are translation equivalents special? Evidence from simulations and empirical data from bilingual infants”.

# Load the data set; here::here() builds the path to the CSV file.
keepers_ws_TE <- here::here("data_keepers/keepers_ws_TE_final.csv") %>%
  rio::import()

Variables

Overview

# Overview of the variables in the data set (codebook_items() is presumably
# provided by the codebook package loaded above; confirm).
codebook_items(keepers_ws_TE)

Codebook table

baby_id & ID_testdate

  • baby_id: Unique participant ID
  • ID_testdate: Unique participant ID + date of test. Some participants made multiple visits; therefore, certain unique participant IDs have multiple rows of data.

The dataset has N = 200 participants and 229 datapoints.

# Number of unique participant IDs and of unique participant-by-test-date IDs.
keepers_ws_TE %>%
  summarise(
    N_baby_id = length(unique(baby_id)),
    N_ID_testdate = length(unique(ID_testdate))
  )
##   N_baby_id N_ID_testdate
## 1       200           229

visit_num

  • visit_num: Whether the data were collected at the child's 1st, 2nd, or 3rd visit to the lab. Most children came to the lab only once; some came back at a later age.
  • multiple_visits: Whether the participant visited the lab once or multiple times. Values: single visit = FALSE, multiple visits = TRUE

While the majority of participants made only 1 visit, some participants made multiple visits.

# Count participants by whether they made multiple visits, keeping one row
# per baby_id within each multiple_visits group.
keepers_ws_TE %>%
  group_by(multiple_visits) %>%
  # The option is spelled `.keep_all`; a `.keep.all` argument is not recognised
  # as an option and is evaluated as an extra column expression instead.
  distinct(baby_id, .keep_all = TRUE) %>%
  count()
## # A tibble: 2 x 2
## # Groups:   multiple_visits [2]
##   multiple_visits     n
##   <lgl>           <int>
## 1 FALSE             156
## 2 TRUE               44
# Number of distinct ID_testdate values (data points) for each visit number.
# count() already returns the per-group row count, so no intermediate
# per-group total column is needed.
keepers_ws_TE %>%
  group_by(visit_num) %>%
  distinct(ID_testdate) %>%
  count()
## # A tibble: 3 x 2
## # Groups:   visit_num [3]
##   visit_num     n
##       <int> <int>
## 1         1   189
## 2         2    38
## 3         3     2

gender

  • gender: categorical with 2 levels: F & M
# Gender split, counting each participant once (first row per baby_id).
keepers_ws_TE %>%
  distinct(baby_id, .keep_all = TRUE) %>%
  count(gender) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
##   gender   n percentage
## 1      F  94         47
## 2      M 106         53

age

  • age_days: age in days
  • age_continuous: age in months (with decimal)
  • age_months_binned: age in months (binned; without decimal)
# Mean, SD, minimum and maximum of age in months (missing values dropped).
keepers_ws_TE %>%
  summarise(
    mean_age_continuous = mean(age_continuous, na.rm = TRUE),
    sd_age_continuous = sd(age_continuous, na.rm = TRUE),
    min_age_continuous = min(age_continuous, na.rm = TRUE),
    max_age_continuous = max(age_continuous, na.rm = TRUE)
  )
##   mean_age_continuous sd_age_continuous min_age_continuous max_age_continuous
## 1            24.40808           4.70224              18.38               33.5
# Histogram of age in months across all data points.
ggplot(keepers_ws_TE, aes(x = age_continuous)) +
  geom_histogram()

years_education

  • years_education: Maternal education in years
# Mean, SD, minimum and maximum of years_education (missing values dropped).
keepers_ws_TE %>%
  summarise(
    mean_years_education = mean(years_education, na.rm = TRUE),
    sd_years_education = sd(years_education, na.rm = TRUE),
    min_years_education = min(years_education, na.rm = TRUE),
    max_years_education = max(years_education, na.rm = TRUE)
  )
##   mean_years_education sd_years_education min_years_education
## 1             16.59633           2.136448                  10
##   max_years_education
## 1                  21
# Histogram of years_education across all data points.
ggplot(keepers_ws_TE, aes(x = years_education)) +
  geom_histogram()

lang_group

All the data points included in the analysis are from bilinguals.

# Number and percentage of data points in each language group.
keepers_ws_TE %>%
  mutate(lang_group = as.factor(lang_group)) %>%
  count(lang_group) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
##   lang_group   n percentage
## 1  bilingual 229        100

lang_exp

  • lang_exp_eng: % of the time when a participant is exposed to English over the course of life globally
  • lang_exp_fre: % of the time when a participant is exposed to French over the course of life globally
  • lang_exp_other: % of the time when a participant is exposed to a third language over the course of life globally
# Stack the three exposure columns into long format, then compute the mean,
# SD, minimum and maximum exposure for each language column.
keepers_ws_TE %>%
  pivot_longer(
    c(lang_exp_eng, lang_exp_fre, lang_exp_other),
    names_to = "language",
    values_to = "lang_exp"
  ) %>%
  group_by(language) %>%
  summarise(
    mean_lang_exp = mean(lang_exp, na.rm = TRUE),
    sd_lang_exp = sd(lang_exp, na.rm = TRUE),
    min_lang_exp = min(lang_exp, na.rm = TRUE),
    max_lang_exp = max(lang_exp, na.rm = TRUE)
  )
## # A tibble: 3 x 5
##   language       mean_lang_exp sd_lang_exp min_lang_exp max_lang_exp
##   <chr>                  <dbl>       <dbl>        <dbl>        <dbl>
## 1 lang_exp_eng          51.7         14.8            25           75
## 2 lang_exp_fre          47.8         15.0            25           75
## 3 lang_exp_other         0.565        1.78            0           10
# Bar chart of the mean exposure to each language, with the column names
# relabelled for display on the x axis.
keepers_ws_TE %>%
  pivot_longer(
    c(lang_exp_eng, lang_exp_fre, lang_exp_other),
    names_to = "language",
    values_to = "lang_exp"
  ) %>%
  group_by(language) %>%
  summarise(mean_lang_exp = mean(lang_exp, na.rm = TRUE)) %>%
  mutate(
    language = recode(
      language,
      lang_exp_eng = "English",
      lang_exp_fre = "French",
      lang_exp_other = "Other language"
    )
  ) %>%
  ggplot(aes(x = language, y = mean_lang_exp)) +
  geom_col() +
  labs(
    x = "Language",
    y = "Mean language exposure (%)"
  )

lang_dom

  • lang_dom: Language dominance based on % language exposure (i.e., the language with a greater exposure is the dominant language)
  • lang_dom_vocab: Language dominance based on vocabulary size (i.e., the language with a greater vocabulary size is the dominant language)
# Number of data points per dominant language, separately for the
# exposure-based (lang_dom) and vocabulary-based (lang_dom_vocab) definitions.
keepers_ws_TE %>%
  pivot_longer(
    c(lang_dom, lang_dom_vocab),
    names_to = "language_dominance",
    values_to = "language"
  ) %>%
  group_by(language_dominance, language) %>%
  summarise(n = n())
## # A tibble: 4 x 3
## # Groups:   language_dominance [2]
##   language_dominance language     n
##   <chr>              <chr>    <int>
## 1 lang_dom           English    127
## 2 lang_dom           French     102
## 3 lang_dom_vocab     English    137
## 4 lang_dom_vocab     French      92
# Side-by-side bars of the number of data points per dominant language under
# each of the two dominance definitions.
keepers_ws_TE %>%
  pivot_longer(
    c(lang_dom, lang_dom_vocab),
    names_to = "language_dominance",
    values_to = "language"
  ) %>%
  group_by(language_dominance, language) %>%
  summarise(n = n()) %>%
  mutate(
    language_dominance = recode(
      language_dominance,
      lang_dom = "Language dominance \n based on % language exposure",
      lang_dom_vocab = "Language dominance \n based on vocabulary size"
    )
  ) %>%
  ggplot(aes(x = language_dominance, y = n, fill = language)) +
  geom_col(position = position_dodge()) +
  labs(
    x = "Types of language dominance",
    y = "Number of participants"
  )

For most children, the language in which they produced the most words was also the language that they heard most often, although this was not the case for some children.

# How often do the vocabulary-defined and exposure-defined dominant languages
# agree? Flag each row (1 = consistent, 0 = inconsistent) and summarise.
# Counts are per row of the data set, i.e. per data point.
keepers_ws_TE %>%
  mutate(consistent_lang_dom = as.numeric(lang_dom == lang_dom_vocab)) %>%
  summarise(
    n = n(),
    n_consistent = sum(consistent_lang_dom),
    percentage_consistent = n_consistent / n * 100,
    n_inconsistent = n - n_consistent,
    percentage_inconsistent = n_inconsistent / n * 100
  )
##     n n_consistent percentage_consistent n_inconsistent percentage_inconsistent
## 1 229          181               79.0393             48                 20.9607

Thus, these two constructs were related, although not identical.

# Scatter plot of balance_vocab against lang_nondom_input, with a linear
# trend line (no confidence band) drawn beneath the points.
# NOTE(review): y is lang_nondom_input although the axis label reads
# "Balance based on exposure" - confirm balance_input was not intended.
ggplot(keepers_ws_TE, aes(x = balance_vocab, y = lang_nondom_input)) +
  stat_smooth(method = "lm", se = FALSE, color = "black") +
  geom_point(shape = 1) +
  theme_light() +
  labs(
    x = "Balance based on vocabulary (BALANCE)",
    y = "Balance based on exposure"
  )

cdi_filled

  • cdi_filled_by: Who filled out the CDI? Due to limitations with the platform we used to enter CDI data, this information is provided for each visit (e.g., if the mother filled out the English CDI and the father filled out the French CDI, the answer is "Mother and Father", without specifying which CDI was filled out by whom)
  • both_cdi_filled: Are both CDIs filled? (Y for yes; N for no)
  • required_cdi_filled: Is the required CDI filled? Required for bilinguals: CDI in English AND French. (Y for yes; N for no)
  • eng_cdi_filled: Is the English CDI filled?
  • fre_cdi_filled: Is the French CDI filled?
  • dom_cdi_filled: Is the CDI in the dominant language (defined by exposure: lang_dom) filled?
  • cdi_available: Is the CDI available for both languages? (bothFilled = both English and French filled)

Who filled out the CDIs?

# Number and percentage of data points by CDI respondent, with "Grandmother"
# folded into "Other family member".
keepers_ws_TE %>%
  mutate(
    cdi_filled_by = recode(cdi_filled_by, Grandmother = "Other family member")
  ) %>%
  count(cdi_filled_by) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
##         cdi_filled_by   n percentage
## 1              Father  15       6.55
## 2              Mother 146      63.76
## 3   Mother and Father  10       4.37
## 4 Other family member   2       0.87
## 5                <NA>  56      24.45

Are both English and French CDIs filled?

# For each CDI-completion variable, count every response value and express it
# as a percentage of that variable's data points.
keepers_ws_TE %>%
  pivot_longer(
    c(
      both_cdi_filled, required_cdi_filled, eng_cdi_filled,
      fre_cdi_filled, dom_cdi_filled, cdi_available
    ),
    names_to = "variable",
    values_to = "response"
  ) %>%
  group_by(variable, response) %>%
  summarise(n = n()) %>%
  mutate(percentage = round(prop.table(n) * 100, digits = 2))
## # A tibble: 6 x 4
## # Groups:   variable [6]
##   variable            response       n percentage
##   <chr>               <chr>      <int>      <dbl>
## 1 both_cdi_filled     Y            229        100
## 2 cdi_available       bothFilled   229        100
## 3 dom_cdi_filled      Y            229        100
## 4 eng_cdi_filled      Y            229        100
## 5 fre_cdi_filled      Y            229        100
## 6 required_cdi_filled Y            229        100

vocabulary measures

Types of vocabulary

  • total_words_eng: Total number of words in English
  • total_words_fre: Total number of words in French
  • word_vocab: Total word vocabulary (= total_words_eng + total_words_fre)
  • concept_vocab: Total concept vocabulary
  • number_of_te: Total number of translation equivalents (TEs), i.e., words produced by one child in both English and French
  • eng_unique_words: Total number of words in English CDI minus number of TEs. Number of words produced only in English (not in French)
  • fre_unique_words: Total number of words in French CDI minus number of TEs. Number of words produced only in French (not in English)
# Mean, SD, minimum and maximum of each vocabulary measure, computed after
# stacking the measures into long format.
keepers_ws_TE %>%
  pivot_longer(
    c(
      total_words_eng, total_words_fre, word_vocab, concept_vocab,
      number_of_te, eng_unique_words, fre_unique_words
    ),
    names_to = "vocab_type",
    values_to = "vocab_score"
  ) %>%
  group_by(vocab_type) %>%
  summarise(
    mean = mean(vocab_score, na.rm = TRUE),
    sd = sd(vocab_score, na.rm = TRUE),
    min = min(vocab_score, na.rm = TRUE),
    max = max(vocab_score, na.rm = TRUE)
  )
## # A tibble: 7 x 5
##   vocab_type        mean    sd   min   max
##   <chr>            <dbl> <dbl> <int> <int>
## 1 concept_vocab    227.  181.      4   695
## 2 eng_unique_words  98.5 125.      1   523
## 3 fre_unique_words  61.2  80.6     0   399
## 4 number_of_te      67.7  85.1     1   409
## 5 total_words_eng  166.  177.      3   657
## 6 total_words_fre  129.  124.      2   532
## 7 word_vocab       295.  255.      6  1071

Derived variables

Instead of coding the total number of words produced in English/French, these variables code the total number of words produced in the dominant/non-dominant language, as defined by vocabulary size (i.e., lang_dom_vocab: the language with the greater vocabulary size is the dominant language).

  • total_words_dom: Total number of words in the dominant language
  • total_words_nondom: Total number of words in the non-dominant language
  • total_singlet_dom: Total number of words in the dominant language minus number of TEs. Number of words produced only in the dominant language.
  • total_singlet_nondom: Total number of words in the non-dominant language minus number of TEs. Number of words produced only in the non-dominant language
  • singlet_vocab: Total singlet vocabulary (total_singlet_dom + total_singlet_nondom)
# Mean, SD, minimum and maximum of each dominance-based vocabulary measure,
# computed after stacking the measures into long format.
keepers_ws_TE %>%
  pivot_longer(
    c(
      total_words_dom, total_words_nondom, total_singlet_dom,
      total_singlet_nondom, singlet_vocab
    ),
    names_to = "vocab_type",
    values_to = "vocab_score"
  ) %>%
  group_by(vocab_type) %>%
  summarise(
    mean = mean(vocab_score, na.rm = TRUE),
    sd = sd(vocab_score, na.rm = TRUE),
    min = min(vocab_score, na.rm = TRUE),
    max = max(vocab_score, na.rm = TRUE)
  )
## # A tibble: 5 x 5
##   vocab_type            mean    sd   min   max
##   <chr>                <dbl> <dbl> <int> <int>
## 1 singlet_vocab        160.  124.      2   525
## 2 total_singlet_dom    138.  124.      2   523
## 3 total_singlet_nondom  21.2  20.1     0    94
## 4 total_words_dom      206.  176.      4   657
## 5 total_words_nondom    88.9  98.5     2   469

balance measures

Defined by vocabulary size

  • balance_vocab: The main balance measure we used in the current study is a Vocabulary Balance, which is determined based on the proportion of words produced in the non-dominant language relative to the total words produced across both languages. The formula for calculating the vocabulary balance score is total_words_nondom/(total_words_dom+total_words_nondom).
# Mean, SD, minimum and maximum of balance_vocab (missing values dropped).
keepers_ws_TE %>%
  summarise(
    mean_balance_vocab = mean(balance_vocab, na.rm = TRUE),
    sd_balance_vocab = sd(balance_vocab, na.rm = TRUE),
    min_balance_vocab = min(balance_vocab, na.rm = TRUE),
    max_balance_vocab = max(balance_vocab, na.rm = TRUE)
  )
##   mean_balance_vocab sd_balance_vocab min_balance_vocab max_balance_vocab
## 1          0.3083577        0.1250181        0.02380952          0.496063
# Histogram of balance_vocab across all data points.
ggplot(keepers_ws_TE, aes(x = balance_vocab)) +
  geom_histogram()

Defined by exposure

Balance can also be considered in terms of input in each language. To make balance_vocab and balance_input comparable, the language designated as DOM and NONDOM was based on vocabulary-defined dominance (lang_dom_vocab), rather than the language that children heard most and least often.

  • lang_dom_input: % of the time when a participant is exposed to the dominant language over the course of life globally
  • lang_nondom_input: % of the time when a participant is exposed to the non-dominant language over the course of life globally
  • balance_input: The formula for calculating the input balance score is also NONDOM/(DOM+NONDOM).
# Mean, SD, minimum and maximum of balance_input (missing values dropped).
# Every statistic must be computed from balance_input, not balance_vocab.
# NOTE(review): the printed output that follows this chunk in the rendered
# document was produced from balance_vocab and is stale until re-knit.
keepers_ws_TE %>%
  summarize(mean_balance_input = mean(balance_input, na.rm = TRUE),
            sd_balance_input = sd(balance_input, na.rm = TRUE),
            min_balance_input = min(balance_input, na.rm = TRUE),
            max_balance_input = max(balance_input, na.rm = TRUE))
##   mean_balance_input sd_balance_input min_balance_input max_balance_input
## 1          0.3083577        0.1250181        0.02380952          0.496063
# Histogram of balance_input across all data points.
ggplot(keepers_ws_TE, aes(x = balance_input)) +
  geom_histogram()

Wordbank percentile measures

  • age_months_percentile: Instead of the child's actual age in months, this variable adjusts for the age range available in the Wordbank CDI-WS data. As the upper age limit of the CDI-WS is 30 months, ages between 30 and 33 months are all set to 30 months.
  • EngWS_90percentile: The number of English words children can produce at the 90th percentile across ages obtained from Wordbank.
  • FrWS_90percentile: The number of French words children can produce at the 90th percentile across ages obtained from Wordbank.
# Side-by-side bars of the English and French 90th-percentile word counts at
# each adjusted age, using the first row found for every age value.
keepers_ws_TE %>%
  distinct(age_months_percentile, .keep_all = TRUE) %>%
  pivot_longer(
    c(EngWS_90percentile, FrWS_90percentile),
    names_to = "language",
    values_to = "words_90percentile"
  ) %>%
  mutate(
    language = recode(
      language,
      EngWS_90percentile = "English",
      FrWS_90percentile = "French"
    )
  ) %>%
  ggplot(aes(x = age_months_percentile, y = words_90percentile, fill = language)) +
  geom_col(position = position_dodge()) +
  labs(
    x = "Age in months adjusted for the CDI-WS age range (18-30m)",
    y = "Number of words produced at the 90th percentile \n (obtained from Wordbank)"
  )